{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# Cross-validation on diabetes Dataset Exercise\n\n\nA tutorial exercise which uses cross-validation with linear models.\n\nThis exercise is used in the `cv_estimators_tut` part of the\n`model_selection_tut` section of the `stat_learn_tut_index`.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "print(__doc__)\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn import datasets\nfrom sklearn.linear_model import LassoCV\nfrom sklearn.linear_model import Lasso\nfrom sklearn.model_selection import KFold\nfrom sklearn.model_selection import GridSearchCV\n\nX, y = datasets.load_diabetes(return_X_y=True)\nX = X[:150]\ny = y[:150]\n\nlasso = Lasso(random_state=0, max_iter=10000)\nalphas = np.logspace(-4, -0.5, 30)\n\ntuned_parameters = [{'alpha': alphas}]\nn_folds = 5\n\nclf = GridSearchCV(lasso, tuned_parameters, cv=n_folds, refit=False)\nclf.fit(X, y)\nscores = clf.cv_results_['mean_test_score']\nscores_std = clf.cv_results_['std_test_score']\nplt.figure().set_size_inches(8, 6)\nplt.semilogx(alphas, scores)\n\n# plot error lines showing +/- std. errors of the scores\nstd_error = scores_std / np.sqrt(n_folds)\n\nplt.semilogx(alphas, scores + std_error, 'b--')\nplt.semilogx(alphas, scores - std_error, 'b--')\n\n# alpha=0.2 controls the translucency of the fill color\nplt.fill_between(alphas, scores + std_error, scores - std_error, alpha=0.2)\n\nplt.ylabel('CV score +/- std error')\nplt.xlabel('alpha')\nplt.axhline(np.max(scores), linestyle='--', color='.5')\nplt.xlim([alphas[0], alphas[-1]])\n\n# #############################################################################\n# Bonus: how much can you trust the selection of alpha?\n\n# To answer this question we use the LassoCV object that sets its alpha\n# parameter automatically from the data by internal cross-validation (i.e. it\n# performs cross-validation on the training data it receives).\n# We use external cross-validation to see how much the automatically obtained\n# alphas differ across different cross-validation folds.\nlasso_cv = LassoCV(alphas=alphas, random_state=0, max_iter=10000)\nk_fold = KFold(3)\n\nprint(\"Answer to the bonus question:\",\n      \"how much can you trust the selection of alpha?\")\nprint()\nprint(\"Alpha parameters maximising the generalization score on different\")\nprint(\"subsets of the data:\")\nfor k, (train, test) in enumerate(k_fold.split(X, y)):\n    lasso_cv.fit(X[train], y[train])\n    print(\"[fold {0}] alpha: {1:.5f}, score: {2:.5f}\".\n          format(k, lasso_cv.alpha_, lasso_cv.score(X[test], y[test])))\nprint()\nprint(\"Answer: Not very much since we obtained different alphas for different\")\nprint(\"subsets of the data and moreover, the scores for these alphas differ\")\nprint(\"quite substantially.\")\n\nplt.show()"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.9"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}